
"""
QC_Bench_Main_Experiments.py

"""

# QC-Bench Core Evaluation Script
# Files needed to run:
# - qc200.json, qc1000.json, qc5184.json (for benchmark subsets)
# - basic_concepts.json, gates_and_circuit_design.json, qml.json, security.json,
#   error_correction.json, algorithms.json, distributed_computing.json (for topic benchmarks)

import os
import re
import time
import json
from collections import defaultdict

import requests
import openai
from anthropic import Anthropic
from google import genai
from google.genai import types

# Set your API keys here.
# NOTE: these assignments overwrite any value already exported in the
# environment for the same variable names.
os.environ.update({
    "OPENAI_API_KEY": "Key1",
    "ANTHROPIC_API_KEY": "Key1",
    "GROQ_API_KEY": "Key1",
    "GOOGLE_API_KEY": "Key1",
})

# Initialize clients.
# The Gemini key is read from GEMINI_API_KEY when that variable is set and
# non-empty, otherwise from GOOGLE_API_KEY.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY)
anthropic_client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
openai.api_key = os.environ["OPENAI_API_KEY"]

# -------------------------------
# USER CONFIGURATION SECTION
# -------------------------------

# Select what to benchmark: keep exactly one of the two assignments below
# active. Each entry is the stem of a "<name>.json" question file.
topics_to_benchmark = [  # Benchmark by dataset size
    "qc200",
    "qc1000",
]
# topics_to_benchmark = [                 # Or benchmark by topic category
#     "basic_concepts",
#     "gates_and_circuit_design",
#     "qml",
#     "security",
#     "error_correction",
#     "algorithms",
#     "distributed_computing"
# ]

# Human-readable names used in the printed reports; a topic without an entry
# is reported under its raw name.
topic_labels = dict(
    # Subset labels
    qc200="200 Questions",
    qc1000="1000 Questions",
    qc5184="5184 Questions",
    # Topic labels
    basic_concepts="Basic Concepts",
    gates_and_circuit_design="Gates & Circuit Design",
    qml="Quantum Machine Learning",
    security="Quantum Security",
    error_correction="Error Correction",
    algorithms="Quantum Algorithms",
    distributed_computing="Distributed Computing",
)

# A question is listed in the final report when at least this many models
# answered it incorrectly.
miss_threshold = 12

# -------------------------------
# MODEL CONFIGURATION
# -------------------------------

# Models to evaluate, grouped by provider. Each provider entry has the shape
# {"models": {api_model_id: report_label}}; models are queried in the order
# listed and results are reported under the label.
#
# NOTE(review): several labels do not appear to match their model ids (the
# OpenAI ids look like GPT-4 Turbo / GPT-3.5 Turbo snapshots while the labels
# say "GPT-4.1 ..."; "claude-3-haiku-20240307" is labelled "Claude 3.5 Haiku").
# Confirm which side is intended before publishing numbers.
model_configs = {
    provider: {"models": models}
    for provider, models in (
        ("openai", {
            "gpt-4-1106-preview": "GPT-4.1",
            "gpt-4-turbo-2024-04-09": "GPT-4.1 mini",
            "gpt-3.5-turbo-0125": "GPT-4.1 nano",
            "gpt-4o": "GPT-4o",
            "gpt-4o-mini-2024-07-18": "GPT-4o-mini",
        }),
        ("anthropic", {
            "claude-3-7-sonnet-20250219": "Claude 3.7 Sonnet",
            "claude-3-5-sonnet-20241022": "Claude 3.5 Sonnet",
            "claude-3-haiku-20240307": "Claude 3.5 Haiku",
        }),
        ("groq", {
            "llama3-70b-8192": "llama3-70b",
            "llama-3.3-70b-versatile": "llama-3.3-70b-versatile",
            "gemma2-9b-it": "gemma2-9b-it",
        }),
        ("gemini", {
            "gemini-1.5-pro-latest": "Gemini 1.5 Pro",
            "gemini-2.0-flash": "Gemini 2.0 Flash",
        }),
    )
}

# -------------------------------
# BENCHMARK EXECUTION
# -------------------------------

# topic -> {model label: accuracy in [0, 1]}
all_results = {}
# (topic, question index) -> [(model label, predicted letter), ...].
# The topic is part of the key so that questions sharing an index in different
# datasets are never merged into one entry.
missed_by = defaultdict(list)
# topic -> loaded question list, kept so the final report can look a missed
# question up in the dataset it actually came from.
datasets = {}

# A standalone option letter in an upper-cased reply. The look-arounds reject
# letters that are part of a word, so the "A" in "ANSWER: B" is not read as
# choice A.
_CHOICE_RE = re.compile(r"(?<![A-Z])[A-D](?![A-Z])")

_GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
_GROQ_TIMEOUT_S = 60  # without a timeout a stalled connection blocks forever
_GROQ_PAUSE_S = 0.6   # delay between consecutive Groq requests


def _build_prompt(q):
    """Return the multiple-choice prompt sent to every model for question ``q``.

    ``q`` must provide the keys ``question``, ``A``, ``B``, ``C`` and ``D``;
    a missing key raises ``KeyError``.
    """
    return f"""You are an expert in quantum computing. Choose the correct answer to the question below.

Question: {q['question']}
A. {q['A']}
B. {q['B']}
C. {q['C']}
D. {q['D']}

Answer with only one letter: A, B, C, or D.

Answer:"""


def _extract_choice(text):
    """Return the first standalone letter A-D in a model reply, or "?".

    The reply is stripped and upper-cased before matching. An empty or
    ``None`` reply yields "?" and is therefore scored as a wrong answer.
    """
    if not text:
        return "?"
    match = _CHOICE_RE.search(text.strip().upper())
    return match.group(0) if match else "?"


def _ask_openai(model_id, prompt):
    """Send ``prompt`` to an OpenAI chat model and return the raw reply text."""
    # NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 interface
    # of the openai package; confirm the installed version still provides it.
    resp = openai.ChatCompletion.create(
        model=model_id,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=5,
    )
    return resp.choices[0].message.content


def _ask_anthropic(model_id, prompt):
    """Send ``prompt`` to an Anthropic model and return the raw reply text."""
    resp = anthropic_client.messages.create(
        model=model_id,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=5,
    )
    return resp.content[0].text


def _ask_groq(model_id, prompt):
    """Send ``prompt`` to a Groq-hosted model and return the raw reply text.

    Raises:
        RuntimeError: if the JSON response carries no ``choices`` entry
            (for example an API error payload).
    """
    resp = requests.post(
        _GROQ_URL,
        headers={
            "Authorization": f"Bearer {os.environ['GROQ_API_KEY']}",
            "Content-Type": "application/json",
        },
        json={
            "model": model_id,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.0,
            "max_tokens": 5,
        },
        timeout=_GROQ_TIMEOUT_S,
    ).json()
    if "choices" not in resp:
        # Surface the payload instead of skipping the question silently.
        raise RuntimeError(f"response has no 'choices': {resp}")
    return resp["choices"][0]["message"]["content"]


def _ask_gemini(model_id, prompt):
    """Send ``prompt`` to a Gemini model and return the raw reply text."""
    resp = client.models.generate_content(
        model=model_id,
        contents=prompt,
        config=types.GenerateContentConfig(
            temperature=0.0,
            max_output_tokens=5,
        ),
    )
    return resp.text


# (key in model_configs, name used in error messages, query function,
#  pause in seconds after each request)
_PROVIDERS = (
    ("openai", "OpenAI", _ask_openai, 0.0),
    ("anthropic", "Anthropic", _ask_anthropic, 0.0),
    ("groq", "Groq", _ask_groq, _GROQ_PAUSE_S),
    ("gemini", "Gemini", _ask_gemini, 0.0),
)


def _evaluate_model(topic, data, provider_name, ask, model_id, label_id, pause_s=0.0):
    """Run one model over every question in ``data`` and return its accuracy.

    Wrong answers are appended to ``missed_by[(topic, index)]``. A request
    that raises is printed, counted as incorrect and not recorded in
    ``missed_by``, since the model never answered. Accuracy is
    ``correct / len(data)``, or 0.0 for an empty dataset.

    Args:
        topic: Dataset name, used as part of the ``missed_by`` key.
        data: List of question dicts (prompt keys plus ``solution``).
        provider_name: Provider name shown in error messages.
        ask: Callable ``(model_id, prompt) -> reply text``.
        model_id: Model identifier passed to the provider API.
        label_id: Label under which the model is reported.
        pause_s: Seconds to sleep after each request (0 disables).
    """
    correct = 0
    errors = 0
    for i, q in enumerate(data):
        prompt = _build_prompt(q)
        try:
            predicted = _extract_choice(ask(model_id, prompt))
            if predicted == q["solution"]:
                correct += 1
            else:
                missed_by[(topic, i)].append((label_id, predicted))
        except Exception as e:
            errors += 1
            print(f"{provider_name} error ({label_id}): {e}")
        if pause_s:
            time.sleep(pause_s)

    total = len(data)
    acc = correct / total if total else 0.0
    if errors:
        print(f" {label_id}: {errors} request(s) failed and were counted as incorrect")
    print(f" {label_id} accuracy: {acc:.2%}")
    return acc


for topic in topics_to_benchmark:
    with open(f"{topic}.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    datasets[topic] = data

    label = topic_labels.get(topic, topic)
    print(f"\n=== Benchmarking: {label} ===")
    results = {}

    for config_key, provider_name, ask, pause_s in _PROVIDERS:
        for model_id, label_id in model_configs[config_key]["models"].items():
            results[label_id] = _evaluate_model(
                topic, data, provider_name, ask, model_id, label_id, pause_s
            )

    all_results[topic] = results

# -------------------------------
# FINAL REPORT
# -------------------------------

print("\n=== FINAL ACCURACY REPORT ===")
for topic, topic_results in all_results.items():
    print(f"\nSubset: {topic_labels.get(topic, topic)}")
    for model, acc in sorted(topic_results.items(), key=lambda x: x[1], reverse=True):
        print(f"{model}: {acc:.2%}")

print(f"\n=== Questions Missed by ≥ {miss_threshold} Models ===")
for (topic, q_idx), models_missed in missed_by.items():
    if len(models_missed) < miss_threshold:
        continue
    # Look the question up in its own dataset, not in whichever dataset
    # happened to be loaded last.
    question = datasets[topic][q_idx]
    question_text = question["question"]
    correct_answer = question["solution"]
    subset = topic_labels.get(topic, topic)
    print(f"\n[{subset}] Q{q_idx+1} missed by {len(models_missed)} models: {question_text}")
    print(f"Correct Answer: {correct_answer}")
    model_list = ", ".join(f"{m}({ans})" for m, ans in models_missed)
    print(f"Missed by: {model_list}")

# Export results to JSON
with open("benchmark_results.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2)
print("\nResults saved to benchmark_results.json")
